import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the Netflix Originals CSV into a DataFrame and preview its first ten rows.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact file exists.
data = pd.read_csv(
    filepath_or_buffer='/Users/gabrielegatulyte/Desktop/NetflixOriginals.csv'
)
data.head(n=10)
| | Title | Genre | Premiere | Runtime | IMDB Score | Language |
|---|---|---|---|---|---|---|
| 0 | Enter the Anime | Documentary | August 5, 2019 | 58 | 2.5 | English/Japanese |
| 1 | Dark Forces | Thriller | August 21, 2020 | 81 | 2.6 | Spanish |
| 2 | The App | Science fiction/Drama | December 26, 2019 | 79 | 2.6 | Italian |
| 3 | The Open House | Horror thriller | January 19, 2018 | 94 | 3.2 | English |
| 4 | Kaali Khuhi | Mystery | October 30, 2020 | 90 | 3.4 | Hindi |
| 5 | Drive | Action | November 1, 2019 | 147 | 3.5 | Hindi |
| 6 | Leyla Everlasting | Comedy | December 4, 2020 | 112 | 3.7 | Turkish |
| 7 | The Last Days of American Crime | Heist film/Thriller | June 5, 2020 | 149 | 3.7 | English |
| 8 | Paradox | Musical/Western/Fantasy | March 23, 2018 | 73 | 3.9 | English |
| 9 | Sardar Ka Grandson | Comedy | May 18, 2021 | 139 | 4.1 | Hindi |
# Dimensions of the DataFrame as a (rows, columns) tuple.
data.shape
(584, 6)
# Number of missing values in each column.
data.isna().sum()
Title 0 Genre 0 Premiere 0 Runtime 0 IMDB Score 0 Language 0 dtype: int64
# Print a summary of the columns: non-null counts, dtypes and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 584 entries, 0 to 583 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Title 584 non-null object 1 Genre 584 non-null object 2 Premiere 584 non-null object 3 Runtime 584 non-null int64 4 IMDB Score 584 non-null float64 5 Language 584 non-null object dtypes: float64(1), int64(1), object(4) memory usage: 27.5+ KB
# Parse the premiere dates, then derive calendar features from them.
data['Premiere'] = pd.to_datetime(data['Premiere'])
premiere_parts = data['Premiere'].dt
data['Year'] = premiere_parts.year
data['Month'] = premiere_parts.month
data['Day'] = premiere_parts.day
# pandas numbers the days of the week from Monday=0 to Sunday=6.
data['Day_of_week'] = premiere_parts.dayofweek
# Bar chart of the number of titles that premiered in each year.
pagal_metus = data['Year'].value_counts()
fig, ax = plt.subplots()
ax.bar(
    pagal_metus.index,
    pagal_metus.values,
    color="palevioletred",
    alpha=0.7,
)
plt.style.use("fast")
ax.set_ylabel("Filmų skaičius")
plt.xticks(rotation=90)
ax.set_title('Išleistų filmų skaičius 2014-2021m.', fontsize=10)
fig.tight_layout()
plt.show()
# Bar chart of the number of titles that premiered in each calendar month.

# Lithuanian month names, January through December, used as the bar labels.
menesiai = ('Sausis', 'Vasaris', 'Kovas', 'Balandis', 'Gegužė', 'Birželis',
            'Liepa', 'Rugpjūtis', 'Rugsėjis', 'Spalis', 'Lapkritis', 'Gruodis')

# Reindex onto all twelve month numbers so every label is paired with its own
# count. Without this, a month with no premieres would be absent from
# value_counts() and the remaining counts would no longer match the labels.
pagal_menesi = data['Month'].value_counts().reindex(range(1, 13), fill_value=0)

fig, ax = plt.subplots()
ax.bar(menesiai, pagal_menesi.values, alpha=0.7, color="palevioletred")
plt.style.use("fast")
ax.set_ylabel("Filmų skaičius")
plt.xticks(rotation=90)
ax.set_title('Išleistų filmų skaičius pagal mėnesius', fontsize=10)
fig.tight_layout()
plt.show()
# Bar chart of the number of titles that premiered on each day of the week.

# Lithuanian weekday names, Monday through Sunday, matching the 0..6 numbering
# of the 'Day_of_week' column.
dienos = ('Pirmadienis', 'Antradienis', 'Trečiadienis', 'Ketvirtadienis',
          'Penktadienis', 'Šeštadienis', 'Sekmadienis')

# Reindex onto all seven weekday numbers so every label is paired with its own
# count. Without this, a weekday with no premieres would be absent from
# value_counts() and the remaining counts would no longer match the labels.
pagal_diena = data['Day_of_week'].value_counts().reindex(range(7), fill_value=0)

fig, ax = plt.subplots()
ax.bar(dienos, pagal_diena.values, alpha=0.7, color="palevioletred")
plt.style.use("fast")
ax.set_ylabel("Filmų skaičius")
plt.xticks(rotation=90)
ax.set_title('Išleistų filmų skaičius pagal dienas', fontsize=10)
fig.tight_layout()
plt.show()
# Number of distinct values in the 'Genre' column.
data['Genre'].nunique()
115
# How many titles carry each genre label, most frequent first.
data['Genre'].value_counts()
Documentary 159
Drama 77
Comedy 49
Romantic comedy 39
Thriller 33
...
Action-adventure 1
Christmas/Fantasy/Adventure/Comedy 1
Science fiction/Action 1
Hidden-camera prank comedy 1
Science fiction adventure 1
Name: Genre, Length: 115, dtype: int64
# Bar chart of the ten most frequent genres.

# Select the "classic" style before the figure is created. Most style settings
# are read when the figure and its artists are built, so switching afterwards
# would leave this chart in the previously active style.
plt.style.use("classic")

# value_counts() sorts by frequency, so the first ten rows are the top ten.
zanras = data['Genre'].value_counts().head(10)

fig, ax = plt.subplots()
ax.bar(zanras.index, zanras.values, alpha=0.7, color="palevioletred")
ax.set_ylabel("Filmų skaičius")
plt.xticks(rotation=90)
ax.set_title('Žanrų top 10', fontsize=10)
fig.tight_layout()
plt.show()
# Number of distinct values in the 'Language' column.
data['Language'].nunique()

# Bar chart of the ten most frequent languages.
# value_counts() sorts by frequency, so the first ten rows are the top ten.
top_10_kalbu = data['Language'].value_counts().head(10)

plt.style.use("classic")
fig, ax = plt.subplots()
ax.bar(top_10_kalbu.index, top_10_kalbu.values, alpha=0.7, color="palevioletred")
# Same y-axis label as the other count charts in this notebook.
ax.set_ylabel("Filmų skaičius")
plt.xticks(rotation=90)
ax.set_title('Top 10 kalbų', fontsize=10)
fig.tight_layout()
plt.show()
# Arithmetic mean of the 'Runtime' column.
data['Runtime'].mean()
93.57705479452055
# Title and runtime of the row(s) whose runtime equals the maximum.
data.loc[data['Runtime'] == data['Runtime'].max(), ['Title', 'Runtime']]
| | Title | Runtime |
|---|---|---|
| 561 | The Irishman | 209 |
# Title and runtime of the row(s) whose runtime equals the minimum.
data.loc[data['Runtime'] == data['Runtime'].min(), ['Title', 'Runtime']]
| | Title | Runtime |
|---|---|---|
| 40 | Sol Levante | 4 |
# Histogram of the 'Runtime' column, split into ten bins.
data['Runtime'].plot.hist(
    bins=10,
    figsize=(5, 5),
    color='palevioletred',
)
plt.style.use("classic")
plt.show()
# Summary statistics (count, mean, std, min, quartiles, max) of the IMDB scores.
data['IMDB Score'].describe()
count 584.000000 mean 6.271747 std 0.979256 min 2.500000 25% 5.700000 50% 6.350000 75% 7.000000 max 9.000000 Name: IMDB Score, dtype: float64
# Title, genre and score of the row(s) whose IMDB score equals the maximum.
data.loc[
    data['IMDB Score'] == data['IMDB Score'].max(),
    ['Title', 'Genre', 'IMDB Score'],
]
| | Title | Genre | IMDB Score |
|---|---|---|---|
| 583 | David Attenborough: A Life on Our Planet | Documentary | 9.0 |
# Title, genre and score of the row(s) whose IMDB score equals the minimum.
data.loc[
    data['IMDB Score'] == data['IMDB Score'].min(),
    ['Title', 'Genre', 'IMDB Score'],
]
| | Title | Genre | IMDB Score |
|---|---|---|---|
| 0 | Enter the Anime | Documentary | 2.5 |
# Mean IMDB score of the titles premiered in each year.
linijinis = data['IMDB Score'].groupby(data['Year']).mean()
linijinis
Year 2014 6.400000 2015 6.877778 2016 6.513333 2017 6.422727 2018 6.360606 2019 6.259200 2020 6.195082 2021 6.046479 Name: IMDB Score, dtype: float64
# Line chart of the IMDB score against premiere year; seaborn aggregates the
# several scores per year into a single line.
plt.style.use("classic")

# Bind the new figure and axes explicitly so that tight_layout() below acts on
# this chart rather than on a `fig` left over from an earlier cell.
fig, ax = plt.subplots(figsize=(10, 4))

# ci=None turns off the confidence band around the line.
# NOTE(review): newer seaborn releases deprecate `ci` in favour of
# `errorbar=None` — confirm against the installed version before changing.
sns.lineplot(x='Year', y='IMDB Score', data=data, ci=None,
             color='palevioletred', ax=ax)
fig.tight_layout()
plt.show()
# Scatter plot with a fitted regression line: IMDB score on the x-axis,
# runtime on the y-axis.
scatter_ax = sns.regplot(
    x='IMDB Score',
    y='Runtime',
    data=data,
    color='palevioletred',
)
plt.style.use("classic")
scatter_ax.set_title('Koreliacija tarp filmo trukmės ir IMDB', fontsize=10)
Text(0.5, 1.0, 'Koreliacija tarp filmo trukmės ir IMDB')